In this document the IPO prospectus PDF files are read into text data processable in Python. Afterwards the text is processed, cleaned and evaluated.¶
Load required packages...
# Consolidated imports.
# BUGFIX: in the original linear order, seaborn was first USED (histplot in
# the plotting section) before its `import seaborn as sns` appeared much
# further down, and matplotlib/gridspec/datetime/scipy were likewise imported
# mid-script.  All imports are hoisted here so the file runs top to bottom.
import os
import re
import gc
import datetime

import pandas as pd
import numpy as np
import bs4
import tqdm
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from scipy import stats
from pdfminer.high_level import extract_text
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
import en_core_web_sm

# Load the small English spaCy model with every component disabled except the
# rule-based sentence splitter ("senter"): only sentence segmentation is
# needed, and this keeps processing fast for very long prospectuses.
nlp = en_core_web_sm.load(disable=['tok2vec', 'tagger', 'attribute_ruler', 'lemmatizer', 'ner', 'parser'])
nlp.enable_pipe("senter")
DATA GENERATION: Create a dictionary of all file names in folder
# Folder holding the prospectus "Risk Factors" documents; the script treats
# it as the working directory.  (The "Faktors" spelling is the actual folder
# name on disk — do not "fix" it.)
path = "C:\\NLP\\Prospectus\\Risk Faktors\\"
os.chdir(path)
# Raw-text accumulator: one entry per readable document, keyed by the issuer
# PermID encoded in the file name.
Text_data = {'Issuer/Borrower PermID': [] , 'content': [] }
DATA GENERATION: Store text into dictionary:
# DATA GENERATION: extract plain text from every PDF/HTML prospectus in the
# working directory and store it under the PermID taken from the file name.
for file in tqdm.tqdm(os.listdir()):
    if file.endswith(".pdf"):
        try:
            text = extract_text(path + file)
            # Collapse all whitespace runs into single spaces.
            text = ' '.join(text.split())
            Text_data['content'].append(text)
            Text_data['Issuer/Borrower PermID'].append(file.split('.', 1)[0])
            # BUGFIX: the original also executed `del soup` here, but `soup`
            # is never created in the PDF branch; the resulting NameError was
            # silently swallowed by the bare `except` (after the appends, so
            # no data was lost — but the gc.collect() below never ran).
            del text
            gc.collect()
        except Exception:
            # Best-effort: skip unreadable PDFs.  Intentionally broad since
            # pdfminer raises many exception types, but no longer a bare
            # `except` (which would also swallow KeyboardInterrupt).
            pass
    elif file.endswith((".htm", ".html")):
        try:
            with open(path + file, "r", encoding="utf-8") as f:
                text = f.read()
            # NOTE(review): no parser is passed to BeautifulSoup, so the
            # parser chosen depends on what is installed locally — consider
            # pinning e.g. "html.parser" for reproducibility.
            soup = bs4.BeautifulSoup(text)
            text = ' '.join(soup.get_text().split())
            Text_data['content'].append(text)
            Text_data['Issuer/Borrower PermID'].append(file.split('.', 1)[0])
            del text
            del soup
            gc.collect()
        except Exception:
            pass
100%|██████████| 222/222 [22:39<00:00, 6.13s/it]
# Persist the raw extraction results so the slow PDF-parsing step does not
# have to be repeated in later sessions.
csv_save = pd.DataFrame(Text_data)
csv_save.to_csv('Text_data.csv', encoding='utf-8',index=False)
Further processing of text:
Split into sentences and count number of words:
# Re-entry point: reload the raw texts saved above.
Text_data = pd.read_csv(r'C:\NLP\Prospectus\Risk Faktors\data\Text_data.csv')
# Raise spaCy's character limit so even the longest prospectus fits in one Doc.
nlp.max_length = 6500000
# Cleaned-text accumulator, parallel to Text_data.
processed_text = { 'Issuer/Borrower PermID': [] , 'content': [] }
processed_text['Issuer/Borrower PermID'] = Text_data['Issuer/Borrower PermID']
Clean the text:
# Clean every raw document and run spaCy on the result.
# The original cleaning was one unreadable nested expression; it is
# decomposed here into named, pre-compiled steps IN THE SAME ORDER.
#
# BUGFIX: the list-marker pattern originally ended in `/g` — a JavaScript
# regex flag pasted into the Python pattern.  It silently became part of the
# last alternative (`[IVX]+\.\s+/g`), which then only matched when followed
# by a literal "/g", so Roman-numeral headings were never stripped.
_RE_CJK = re.compile(r"([\u4e00-\u9fa5])+")           # Chinese characters
_RE_URL = re.compile(r"\S*https?:\S*")                # URLs incl. glued text
_RE_MARKERS = re.compile(                             # list/heading markers
    r'\d\.\s+|\b[a-z]\)\s+|•\s+|●\s+|\[[^\]]*\]|○\s+|[A-Z]\.\s+|[IVX]+\.\s+')
_RE_DOTS = re.compile(r'\.{2,}')                      # dot leaders/ellipses
_RE_SENT_GLUE = re.compile(r'(\. )([a-z])')           # ". x" -> " x"
# Remove the space between decimal numbers (translated from the original
# German comment: "Leer zwischen Dezimalzahlen entfernen").
_RE_NUM_PAIR = re.compile(r'\d+\.?\,?\d*\s{1}\d+\.?\,?\d*')

def _clean_text(raw):
    """Apply the cleaning cascade to one raw document string."""
    t = _RE_CJK.sub("", raw)
    t = _RE_URL.sub("", t)
    t = t.replace(' . ', '.')
    t = _RE_MARKERS.sub(". ", t)
    t = _RE_DOTS.sub(' ', t)
    t = t.replace(';', '.')
    t = _RE_SENT_GLUE.sub(r' \2', t)
    # NOTE(review): the original chain contained `.replace(" ", " ")` and
    # `.replace(' ', ' ')` — space-to-space no-ops, presumably non-breaking
    # spaces lost in transcription; they are dropped here as no-ops.
    t = t.replace('-', ' ').replace(". , ", ", ")
    t = _RE_NUM_PAIR.sub("", t)
    for junk in ('(', ')', '“', '”', '‘', '’', '`', '− ', '- ', '_'):
        t = t.replace(junk, '')
    t = t.replace('cid', '').replace('\\', '')
    return t

for i in tqdm.tqdm(range(len(Text_data['content']))):
    if pd.isnull(Text_data['content'][i]):
        # Keep a sentinel string so row alignment with the PermIDs survives.
        processed_text['content'].append('nan')
    else:
        # Note: spaCy Doc objects are stored; pandas stringifies them on save.
        processed_text['content'].append(nlp(_clean_text(Text_data['content'][i])))
100%|██████████| 221/221 [00:32<00:00, 6.88it/s]
# Persist the cleaned texts (spaCy Docs are converted to str by pandas).
csv_save_processed = pd.DataFrame(processed_text)
csv_save_processed.to_csv(r'C:\NLP\Prospectus\Risk Faktors\data\Processed_Text.csv', encoding='utf-8',index=False)
START HERE WHEN USING ALREADY CLEANED EXCEL DATA
# Re-entry point: load the already-cleaned texts; force `content` to str so
# NaN cells come back as the literal string "nan".
csv_save_processed = pd.read_csv(r'C:\NLP\Prospectus\Risk Faktors\data\Processed_Text.csv', converters={'content': lambda x: str(x)})
NLP_results = pd.DataFrame()
# NOTE(review): the CSV written above has the column 'Issuer/Borrower PermID',
# not 'filename' — this line presumably targets an older file layout and
# would raise a KeyError against the CSV produced by this script; verify the
# actual column name on disk.
NLP_results['filename'] = csv_save_processed['filename']
NLP_results.rename(columns={'filename':'Issuer/Borrower PermID'}, inplace=True)
csv_save_processed.rename(columns={'filename':'Issuer/Borrower PermID'}, inplace=True)
Add the number of words as a variable
# Record each document's length as the 'words' column.
# NOTE(review): `len()` of a str counts CHARACTERS, not words — since
# `content` was just read from CSV with a str converter, this column is a
# character count.  Only if `content` still held spaCy Docs would len() give
# a token count.  Confirm which is intended before interpreting the ratios
# that are normalised by this column further down.
words=[]
for i in range(len(csv_save_processed['content'])):
    words.append(len(csv_save_processed['content'][i]))
NLP_results['words'] = words
del words
Divide the whole text into sentences
# Segment every document into sentences using spaCy's senter component.
nlp.max_length = 6500000
sentences = []
for document in tqdm.tqdm(csv_save_processed['content']):
    spans = nlp(document).sents
    sentences.append([span.text.strip() for span in spans])
csv_save_processed['sentences'] = sentences
Count the number of sentences
# Number of sentences per document.
NLP_results['sentences'] = [len(sentence_list) for sentence_list in sentences]
#csv_save_processed.to_csv('Processed_Text.csv', encoding='utf-8',index=False)
#csv_save_processed=pd.read_csv(r'C:\NLP\Prospectus\Risk Faktors\data\Processed_Text.csv', converters={'content': lambda x: str(x)})
Start Natural-Language-Processing¶
Load the list of COVID-Related Terms found on the internet and provided by UVA Health
# Load the COVID-related term list (two header rows skipped; terms forced to
# str so numeric-looking entries survive).
cov_terms = pd.read_excel(r'H:\IPO Sentiment\Code\Data\Covid_Related_Terms (1).xlsx', skiprows=[0,1], converters={'TERM': lambda x: str(x)})
# NOTE(review): rows 12 and 217 are manually invalidated here — presumably
# known-corrupt extractions, but the reason is not visible in this file.
csv_save_processed['content'][12] = np.nan
csv_save_processed['content'][217] = np.nan
Count the amount of COVID-related words
# Count COVID-related terms per document, normalised by document length.
#
# BUGFIX: `add` was initialised once BEFORE the loop and never reset, so
# every document's count silently included the accumulated counts of all
# previous documents — the ratio grew monotonically with file order.  It is
# now reset for each document.
cov = []
for i in tqdm.tqdm(range(len(csv_save_processed['content']))):
    if pd.isnull(csv_save_processed['content'][i]):
        cov.append(np.nan)
    else:
        add = 0
        # Lower-case once per document, not once per term.
        doc_lower = csv_save_processed['content'][i].lower()
        for k in range(len(cov_terms)):
            # Whole-word matches only (\b anchors); terms are regex-escaped.
            add += len(re.findall(r"\b" + re.escape(cov_terms['TERM'][k].lower()) + r"\b", doc_lower))
        cov.append(add / NLP_results['words'][i])
NLP_results['pandemic words'] = cov
100%|██████████| 221/221 [22:41<00:00, 6.16s/it]
#NLP_results.to_csv(r'C:\NLP\Prospectus\Risk Faktors\data\NLP_results_pandemic_words.csv', encoding='utf-8',index=False)
The Loughran-McDonald dictionary is a well-established dictionary in NLP listing a range of words reflecting e.g. uncertainty, complexity or constraints. Those words can be counted as well.
# Load the Loughran-McDonald master dictionary and extract one lower-cased
# word list per sentiment category (a non-zero cell marks membership).
L_Mc_dict = pd.read_excel('H:\\IPO Sentiment\\Code\\Data\\LoughranMcDonald_MasterDictionary_2020.xlsx')
L_Mc_dict['Word'] = L_Mc_dict['Word'].astype(str)

def _category_words(column):
    # Words whose entry in `column` is non-zero, lower-cased, as a list.
    return L_Mc_dict[L_Mc_dict[column] != 0]['Word'].str.lower().tolist()

negative = _category_words("Negative")
positive = _category_words("Positive")
uncertainty = _category_words("Uncertainty")
litigious = _category_words("Litigious")
strong_modal = _category_words("Strong_Modal")
weak_modal = _category_words("Weak_Modal")
constraining = _category_words("Constraining")
complexity = _category_words("Complexity")
# Negation cues for the "negated within three preceding words" check
# (contracted and apostrophe-free spellings both included).
negate = [
    "aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt",
    "doesnt", "ain't", "aren't", "can't", "couldn't", "daren't", "didn't",
    "doesn't", "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt",
    "mustnt", "neither", "don't", "hadn't", "hasn't", "haven't", "isn't",
    "mightn't", "mustn't", "neednt", "needn't", "never", "none", "nope",
    "nor", "not", "nothing", "nowhere", "oughtnt", "shant", "shouldnt",
    "wasnt", "werent", "oughtn't", "shan't", "shouldn't", "wasn't",
    "weren't", "without", "wont", "wouldnt", "won't", "wouldn't", "rarely",
    "seldom", "despite", "no", "nobody",
]
senti_dict = {'Negative': negative , 'Positive': positive , 'Uncertainty': uncertainty, 'Constraining': constraining}
How many percent of uncertain words are part of negative words?¶
def negated(word, negate_words=None):
    """Return True if `word` (case-insensitive) is a negation cue.

    `negate_words` defaults to the module-level `negate` list; passing an
    explicit iterable makes the function usable stand-alone.  This is a
    backward-compatible generalisation — existing one-argument calls behave
    exactly as before.
    """
    if negate_words is None:
        negate_words = negate
    # Idiomatic: return the boolean directly instead of if/else True/False.
    return word.lower() in negate_words
def tone_count_with_negation_check(dict, article, negate_words=None):
    """Count positive and negative words with a simple negation check.

    Negative words are counted as-is.  A positive word is flipped to a
    negative hit when any of the (up to) three preceding tokens is a
    negation cue.

    Parameters
    ----------
    dict : mapping with 'Positive' and 'Negative' word lists.  (The name
        shadows the builtin but is kept for call compatibility.)
    article : str — text to score (lower-cased internally).
    negate_words : optional iterable of negation cues; defaults to the
        module-level `negate` list (backward-compatible generalisation).

    Returns
    -------
    [word_count, pos_count, neg_count, pos_words, neg_words]
    """
    if negate_words is None:
        negate_words = negate
    pos_count = 0
    neg_count = 0
    pos_words = []
    neg_words = []
    # Tokenise; keeps n't contractions and 's possessives as single tokens.
    input_words = re.findall(r'\b([a-zA-Z]+n\'t|[a-zA-Z]+\'s|[a-zA-Z]+)\b', article.lower())
    word_count = len(input_words)
    # Set lookups instead of list scans — same semantics, O(1) membership.
    negative_set = set(dict['Negative'])
    positive_set = set(dict['Positive'])
    negate_set = {w.lower() for w in negate_words}
    for i, word in enumerate(input_words):
        if word in negative_set:
            neg_count += 1
            neg_words.append(word)
        if word in positive_set:
            # The original i>=3 / i==2 / i==1 / i==0 cascade collapses to a
            # single window check over the up-to-three preceding tokens.
            if any(w in negate_set for w in input_words[max(0, i - 3):i]):
                neg_count += 1
                neg_words.append(word + ' (with negation)')
            else:
                pos_count += 1
                pos_words.append(word)
    return [word_count, pos_count, neg_count, pos_words, neg_words]
def uncert_constrain_count_with_negation_check(dict, article, negate_words=None):
    """Count uncertainty and constraining words, skipping negated hits.

    Unlike the tone counter, a dictionary word occurring within three tokens
    after a negation cue is simply NOT counted (no flip).

    Parameters
    ----------
    dict : mapping with 'Uncertainty' and 'Constraining' word lists.  (The
        name shadows the builtin but is kept for call compatibility.)
    article : str — text to score (lower-cased internally).
    negate_words : optional iterable of negation cues; defaults to the
        module-level `negate` list (backward-compatible generalisation).

    Returns
    -------
    [word_count, uncert_count, const_count, uncert_words, const_words]
    """
    if negate_words is None:
        negate_words = negate
    uncert_count = 0
    const_count = 0
    uncert_words = []
    const_words = []
    input_words = re.findall(r'\b([a-zA-Z]+n\'t|[a-zA-Z]+\'s|[a-zA-Z]+)\b', article.lower())
    word_count = len(input_words)
    uncertainty_set = set(dict['Uncertainty'])
    constraining_set = set(dict['Constraining'])
    negate_set = {w.lower() for w in negate_words}
    for i, word in enumerate(input_words):
        # One window check replaces the duplicated i>=3/i==2/i==1/i==0
        # cascades of the original (the window is empty at i == 0).
        is_negated = any(w in negate_set for w in input_words[max(0, i - 3):i])
        if word in uncertainty_set and not is_negated:
            uncert_count += 1
            uncert_words.append(word)
        if word in constraining_set and not is_negated:
            const_count += 1
            const_words.append(word)
    return [word_count, uncert_count, const_count, uncert_words, const_words]
# Remove the two invalidated documents and renumber the rows.
# BUGFIX: `reset_index(inplace=True)` without drop=True inserts the old index
# as a new 'index' COLUMN into both frames, polluting downstream merges and
# plots; drop=True discards it.
csv_save_processed.drop([12, 217], inplace=True)
NLP_results.drop([12, 217], inplace=True)
csv_save_processed.reset_index(drop=True, inplace=True)
NLP_results.reset_index(drop=True, inplace=True)
# Score every document with both LMD-based counters.
senti_res = [tone_count_with_negation_check(senti_dict, document)
             for document in tqdm.tqdm(csv_save_processed['content'])]
uncert_const_res = [uncert_constrain_count_with_negation_check(senti_dict, document)
                    for document in tqdm.tqdm(csv_save_processed['content'])]
100%|██████████| 219/219 [00:36<00:00, 6.03it/s]
# Unpack the counter results into NLP_results and normalise by document
# length.  (The original rebuilt the same DataFrame from scratch for every
# single column; each frame is now built once.)
tone_df = pd.DataFrame(senti_res, columns=['words', 'pos', 'neg', 'pos_words', 'neg_words'])
uncert_df = pd.DataFrame(uncert_const_res, columns=['words', 'uncertain', 'constraining', 'uncert_words', 'const_words'])
NLP_results['positive'] = tone_df['pos']
NLP_results['negative'] = tone_df['neg']
NLP_results['uncertainty'] = uncert_df['uncertain']
NLP_results['constraining'] = uncert_df['constraining']
# Net sentiment first — it needs the raw counts before they are scaled.
NLP_results['sentiment'] = (NLP_results['positive'] - NLP_results['negative']) / NLP_results['words']
for col in ('positive', 'negative', 'uncertainty', 'constraining'):
    NLP_results[col] = NLP_results[col] / NLP_results['words']
#NLP_results.to_csv(r'C:\NLP\Prospectus\Risk Faktors\data\NLP_results.csv', encoding='utf-8',index=False)
#NLP_results=pd.read_csv(r'C:\NLP\Prospectus\Risk Faktors\data\NLP_results.csv')
Visualize COVID measure and sentiment
# Load the IPO master list (CIK forced to str to keep leading zeros) and
# attach each issuer's nation to the results.
IPOs = pd.read_excel("H:\\IPO Sentiment\\Code\\Data\\IPOs since 2017.xlsx", converters={'CIK': lambda x: str(x)})
NLP_results = NLP_results.merge(IPOs[['Issuer/Borrower PermID', 'Issuer/Borrower Nation']], on='Issuer/Borrower PermID')
import matplotlib.pyplot as plt
# Inspect the set of nations present (output shown below in the notebook).
NLP_results['Issuer/Borrower Nation'].unique()
array(['Germany', 'India', 'Norway', 'Hong Kong', 'Singapore', 'Spain',
'United Kingdom', 'United Arab Emirates', 'Bahrain',
'China (Mainland)', 'France', 'United States', 'Finland',
'Denmark', 'Guernsey', 'Portugal', 'Cambodia', 'Netherlands',
'Turkey', 'Sweden', 'Saudi Arabia', 'South Korea', 'Austria',
'Russia', 'Iceland', 'Ireland', 'Bangladesh', 'Malaysia',
'Australia', 'Luxembourg', 'Qatar', 'Chile', 'Canada',
'Cayman Islands', 'Jersey', 'Switzerland'], dtype=object)
# Map each issuer nation to a coarse region.
# Rewritten from six giant `or`-chains into an ordered (region, nations)
# rule table.  The check ORDER is preserved so overlapping entries resolve
# exactly as the original elif chain did: e.g. 'India' appears under both
# Asia and Middle East — Asia wins because it is tested first, and Bermuda /
# British Virgin Islands resolve to North America before South America.
_REGION_RULES = [
    ('Africa', {'South Africa', 'Ghana', 'Tanzania', 'Togo', 'Ivory Coast',
                'Nigeria', 'Morocco', 'Mozambique', 'Namibia', 'Uganda',
                'Malawi', 'Tunisia', 'Zambia', 'Rwanda', 'Mauritius'}),
    ('Asia', {'Japan', 'China (Mainland)', 'Hong Kong', 'South Korea',
              'India', 'Thailand', 'Singapore', 'Indonesia', 'Vietnam',
              'Philippines', 'Malaysia', 'Taiwan', 'Maldives', 'Cambodia',
              'Nepal', 'Bangladesh', 'Macau', 'Sri Lanka', 'Pakistan',
              'Mongolia', 'Laos', 'Myanmar', 'Kazakhstan'}),
    ('Europe', {'Germany', 'Poland', 'Netherlands', 'Italy', 'Luxembourg',
                'United Kingdom', 'Switzerland', 'Sweden', 'Norway',
                'France', 'Austria', 'Belgium', 'Spain', 'Russia',
                'Finland', 'Turkey', 'Slovenia', 'Ireland', 'Denmark',
                'Lithuania', 'Iceland', 'Cyprus', 'Isle of Man', 'Estonia',
                'Portugal', 'Liechtenstein', 'Hungary', 'Romania', 'Malta',
                'Bulgaria', 'Czech Republic', 'Croatia', 'Latvia', 'Serbia',
                'Jersey'}),
    ('Middle East', {'Saudi Arabia', 'Iran', 'United Arab Emirates',
                     'Kuwait', 'India', 'Qatar', 'Oman', 'Bahrain',
                     'Jordan', 'Israel'}),
    ('North America', {'United States', 'Canada', 'Mexico', 'Bahamas',
                       'Bermuda', 'British Virgin Islands'}),
    ('South America', {'Chile', 'Argentina', 'Peru', 'Brazil', 'Bermuda',
                       'British Virgin Islands'}),
]

def _nation_to_region(nation):
    # First matching rule wins; anything unlisted (e.g. Guernsey, Cayman
    # Islands) falls through to 'Rest', as in the original.
    for region_name, nations in _REGION_RULES:
        if nation in nations:
            return region_name
    return 'Rest'

region = [_nation_to_region(n) for n in NLP_results['Issuer/Borrower Nation']]
NLP_results['region'] = region
Divide Dataset into pre- and post COVID by IPO Date¶
NLP_results = NLP_results.merge(IPOs[['Issuer/Borrower PermID', 'Issue Date']], on='Issuer/Borrower PermID')
NLP_results['Issue Date'] = NLP_results['Issue Date'].apply(lambda x: x.date())
split_date = pd.datetime(2020,1,1).date()
results_pre = NLP_results.loc[NLP_results['Issue Date'] <= split_date]
results_post = NLP_results.loc[NLP_results['Issue Date'] > split_date]
import matplotlib.gridspec as gridspec
import datetime
# Metric columns to plot, and the horizontal threshold line drawn in each
# panel (one [y, y] pair per metric, in the same order).
plot_data_name= ['constraining','uncertainty','sentiment','finbert sentiment']
plot_lines= [[0.00125,0.00125],[0.0055,0.0055],[-0.004,-0.004],[-0.35,-0.35]]
# Share of observations at or above each metric's threshold line, computed
# separately for the pre- and post-COVID samples (annotated in the figure
# below as "... of the Data").
# NOTE(review): 'finbert sentiment' is merged into NLP_results only in the
# FinBERT section much further down — in strict top-to-bottom execution this
# cell raises a KeyError, so the notebook cells were evidently run out of
# order.
percent_calc_pre = [
    sum(results_pre['constraining']>=plot_lines[0][0])/len(results_pre['constraining']),
    sum(results_pre['uncertainty']>=plot_lines[1][0])/len(results_pre['uncertainty']),
    sum(results_pre['sentiment']>=plot_lines[2][0])/len(results_pre['sentiment']),
    sum(results_pre['finbert sentiment']>=plot_lines[3][0])/len(results_pre['finbert sentiment'])
]
percent_calc_post = [
    sum(results_post['constraining']>=plot_lines[0][0])/len(results_post['constraining']),
    sum(results_post['uncertainty']>=plot_lines[1][0])/len(results_post['uncertainty']),
    sum(results_post['sentiment']>=plot_lines[2][0])/len(results_post['sentiment']),
    sum(results_post['finbert sentiment']>=plot_lines[3][0])/len(results_post['finbert sentiment'])
]
# Per-day means of every metric in the post-COVID sample.
# The original re-sorted the unique date list and re-filtered the whole frame
# inside a double loop (O(days x rows) per metric); a single groupby produces
# identical per-date means, and groupby sorts by key — matching the np.sort
# order used for the 'Date' column.
time_trends_post = pd.DataFrame(columns=["Date", 'uncertainty', 'constraining', 'sentiment', 'finbert sentiment'])
time_trends_post['Date'] = np.sort(results_post['Issue Date'].unique())
daily_means = results_post.groupby('Issue Date')[plot_data_name].mean()
for metric in plot_data_name:
    time_trends_post[metric] = daily_means[metric].to_numpy()
# Four stacked figure sections (constraining, uncertainty, LMD sentiment,
# FinBERT sentiment).  Each section is a 2x2 subgrid: pre-COVID scatter and
# COVID scatter on top, the matching density histograms underneath.
# NOTE(review): `sns` is used here but `import seaborn as sns` only appears
# AFTER this cell in source order — another out-of-order notebook cell.
rows = 4
cols = 1
fig = plt.figure(figsize=(20, 25), dpi=80)
# grid for pairs of subplots
grid = plt.GridSpec(rows, cols)
for i in range(rows * cols):
    # create fake subplot just to title pair of subplots
    fake = fig.add_subplot(grid[i])
    fake.set_title(['Constraining Word Ratio','Uncertainty Word Ratio', 'LMD Sentiment Ratio','Finbert Sentiment Ratio'][i], fontweight='semibold', size=16)
    fake.set_axis_off()
    # create subgrid for two subplots without space between them
    # <https://matplotlib.org/2.0.2/users/gridspec.html>
    gs = gridspec.GridSpecFromSubplotSpec(2, 2, subplot_spec=grid[i], wspace=0.05)
    # real subplot #1: pre-COVID scatter over issue dates
    ax = fig.add_subplot(gs[0])
    ax.set_title(['Pre-COVID','Pre-COVID','Pre-COVID', 'Pre-COVID'][i])
    # hide ticks and labels
    ax.tick_params(left=True, labelleft=True, labelbottom=True, bottom=False)
    ax.scatter(results_pre['Issue Date'], results_pre[plot_data_name[i]], s =35, c = 'black')
    #ax.plot([min(results_pre['Issue Date']), max(results_pre['Issue Date'])], plot_lines[i])
    # Shaded band from the threshold line up to a hand-picked upper bound.
    plt.axhspan(plot_lines[i][0], [0.0021,0.0068,-0.00045,-0.15][i], facecolor='0.2', alpha=0.2)
    plt.text(min(results_pre['Issue Date']), [0.0019,0.0064,-0.0011,-0.19][i], '{0:.0%}'.format(percent_calc_pre[i]) + ' of the Data', color="blue", fontsize=16)
    # NOTE(review): the PRE panel's y-limits are derived from results_post —
    # presumably deliberate so both panels share a scale; confirm.
    plt.ylim(min(results_post[plot_data_name[i]])-[0.0001,0.0002,0.0005,0.02][i], max(results_post[plot_data_name[i]])+[0.0002,0.0004,0.0003,0.04][i])
    plt.ylabel(['Constraining Word Ratio','Uncertainty Word Ratio', 'LMD Sentiment Ratio','Finbert Sentiment Ratio'][i])
    # Density histogram underneath the pre-COVID scatter.
    ax = fig.add_subplot(gs[2])
    # hide ticks and labels
    ax.tick_params(left=True, labelleft=True, labelbottom=True, bottom=False)
    sns.histplot(data=results_pre[plot_data_name[i]], kde=True, stat='density')
    plt.ylim([0,0,0,0][i], [2300,1150,650,6.2][i])
    plt.xlim([0.0004,0.003,-0.0057,-0.51][i], [0.0021,0.008,-0.001,-0.165][i])
    plt.ylabel("Density")
    plt.xlabel("")
    # real subplot #2: COVID-period scatter
    ax = fig.add_subplot(gs[1])
    ax.set_title(['COVID','COVID','COVID','COVID'][i])
    # hide ticks and labels
    ax.tick_params(left=False, labelleft=False, labelbottom=True, bottom=False)
    ax.scatter(results_post['Issue Date'], results_post[plot_data_name[i]], s =35, c = 'black')
    #ax.plot([min(results_post['Issue Date']), max(results_post['Issue Date'])], plot_lines[i])
    plt.axhspan(plot_lines[i][0], [0.0021,0.0068,-0.00045,-0.15][i], facecolor='0.2', alpha=0.2)
    plt.text(min(results_post['Issue Date']+datetime.timedelta(days=4)), [0.0019,0.0064,-0.0011,-0.19][i], '{0:.0%}'.format(percent_calc_post[i]) + ' of the Data', color="blue", fontsize=16)
    plt.ylim(min(results_post[plot_data_name[i]])-[0.0001,0.0002,0.0005,0.02][i], max(results_post[plot_data_name[i]])+[0.0002,0.0004,0.0003,0.04][i])
    # Density histogram underneath the COVID scatter.
    ax = fig.add_subplot(gs[3])
    # hide ticks and labels
    ax.tick_params(left= False, labelleft=False, labelbottom=True, bottom=False)
    sns.histplot(data=results_post[plot_data_name[i]], kde=True, stat='density' )
    plt.ylim([0,0,0,0][i], [2300,1150,650,6.2][i])
    plt.xlim([0.0004,0.003,-0.0057,-0.51][i], [0.0021,0.008,-0.001,-0.165][i])
    plt.ylabel("")
    plt.xlabel("")
fig.patch.set_facecolor('lightgrey')
#fig.suptitle('Overview of Wordcounts', fontweight='bold', size=16)
fig.tight_layout()
import seaborn as sns  # NOTE(review): sns is already used in the cell above — move this import to the top of the file.
# Regression of each LMD metric against the pandemic-word ratio (pre-COVID).
plot2_axis_labels= [['LMD Constraining Ratio','Pandemic Word Ratio'],['LMD Uncertainty Ratio', 'Pandemic Word Ratio'],['LMD Sentiment Ratio','Pandemic Word Ratio']]
rows = 3
cols = 1
fig = plt.figure(figsize=(20, 10), dpi=80)
# grid for pairs of subplots
grid = plt.GridSpec(rows, cols)
for i in range(rows * cols):
    # create fake subplot just to title pair of subplots
    fake = fig.add_subplot(grid[i])
    fake.set_title(['Constraining Words:','Uncertain Words:','Sentiment:'][i], fontweight='bold', size=16, loc='left')
    fake.set_axis_off()
    # single-cell subgrid keeps the layout consistent with the figure above
    # <https://matplotlib.org/2.0.2/users/gridspec.html>
    gs = gridspec.GridSpecFromSubplotSpec(1, 1, subplot_spec=grid[i], wspace=0.05)
    ax = fig.add_subplot(gs[0])
    ax.tick_params(left=True, labelleft=True, labelbottom=True, bottom=True)
    # Clamp the axes to the observed data range.
    ax.axis([min(results_pre['pandemic words']),max(results_pre['pandemic words']),min(results_pre[plot_data_name[i]]),max(results_pre[plot_data_name[i]])])
    sns.regplot(x='pandemic words',y=plot_data_name[i], data = results_pre, line_kws={'color': 'black'})
    plt.xlabel(plot2_axis_labels[i][1], size=12)
    plt.ylabel(plot2_axis_labels[i][0], size=12)
fig.patch.set_facecolor('lightgrey')
#fig.suptitle('Overview Wordcounts', fontweight='bold', size=16)
fig.tight_layout()
from scipy import stats
(0.3626434742394554, 3.2919953908863184e-08)
# Pearson correlation of the pandemic-word ratio with each LMD metric
# (same output order as the original three explicit print calls).
for metric in ('sentiment', 'uncertainty', 'constraining'):
    print(stats.pearsonr(np.array(NLP_results['pandemic words']), np.array(NLP_results[metric])))
(0.3626434742394554, 3.2919953908863184e-08) (0.3168112250248351, 1.7065657902013273e-06) (-0.14130159240863932, 0.03665360879070792)
# Spearman rank correlation of the pandemic-word ratio with each LMD metric.
for metric in ('sentiment', 'uncertainty', 'constraining'):
    print(stats.spearmanr(np.array(NLP_results['pandemic words']), np.array(NLP_results[metric])))
SpearmanrResult(correlation=0.3044901300427793, pvalue=4.438645404602572e-06) SpearmanrResult(correlation=0.24170948441895618, pvalue=0.0003059827524162565) SpearmanrResult(correlation=-0.11424135245163458, pvalue=0.09170563417495373)
# Correlation-matrix heatmap over the numeric result columns.
# NOTE(review): `corrMatrix` is not defined anywhere in this file — it was
# presumably created in an interactive session cell (likely `corrMatrix =
# NLP_results`); as written this cell raises NameError.
f = plt.figure(figsize=(19, 15))
plt.matshow(corrMatrix.corr(), fignum=f.number)
plt.xticks(range(corrMatrix.select_dtypes(['number']).shape[1]), corrMatrix.select_dtypes(['number']).columns, fontsize=14, rotation=45)
plt.yticks(range(corrMatrix.select_dtypes(['number']).shape[1]), corrMatrix.select_dtypes(['number']).columns, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16);
FIN-BERT Model for Sentiment
# FinBERT sentiment model (three classes: neutral / positive / negative).
finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

# Drop sentences containing more than 400 spaces — they would exceed the
# model's input length.  (Same keep-condition as the original zip/filter.)
sentences2 = []
for sentence_list in csv_save_processed['sentences']:
    sentences2.append([s for s in sentence_list if s.count(' ') <= 400])
csv_save_processed['sentences2'] = sentences2

senti = pipeline("sentiment-analysis", model=finbert, tokenizer=tokenizer)
# Classify every sentence of every document with FinBERT.
# Labels: LABEL_0 = neutral; LABEL_1 = positive; LABEL_2 = negative.
results = []
for sentence_list in tqdm.tqdm(csv_save_processed['sentences2']):
    results.append(senti(sentence_list))
100%|██████████| 219/219 [4:34:40<00:00, 75.25s/it]
# Aggregate FinBERT sentence labels per document.
# The original made two separate passes over `results` (one for positive,
# one for negative) and shadowed its outer loop variable `i` inside the
# comprehensions; one pass collecting both counts is equivalent and clearer.
pos = []
neg = []
Finbert_res = pd.DataFrame(NLP_results[['Issuer/Borrower PermID','sentences']])
for doc_labels in results:
    pos.append(len([r for r in doc_labels if r['label'] == 'positive']))
    neg.append(len([r for r in doc_labels if r['label'] == 'negative']))
Finbert_res['finbert positive'] = pos
Finbert_res['finbert negative'] = neg
# Net sentiment normalised by each document's sentence count.
Finbert_res['finbert sentiment'] = (Finbert_res['finbert positive'] - Finbert_res['finbert negative']) / Finbert_res['sentences']
%store Finbert_res
Stored 'results' (list)
%store -r Finbert_res
# Attach the FinBERT sentiment to the main result table.
NLP_results = NLP_results.merge(Finbert_res[['Issuer/Borrower PermID', 'finbert sentiment']], on='Issuer/Borrower PermID')
# NOTE(review): rows 12 and 217 were already dropped (and the index reset)
# earlier in the script, so this second drop removes two DIFFERENT rows —
# verify this is intentional.
NLP_results = NLP_results.drop([12,217])
# Regressions of FinBERT sentiment against the word-count metrics in the
# COVID sample: constraining & pandemic words in row 1, uncertainty & LMD
# sentiment in row 2.
finbert_plotdata_names1 = ['constraining','uncertainty']
finbert_plotdata_names2 = ['pandemic words', 'sentiment']
rows = 2
cols = 1
fig = plt.figure(figsize=(20, 10), dpi=80)
# grid for pairs of subplots
grid = plt.GridSpec(rows, cols)
for i in range(rows * cols):
    # create fake subplot just to title pair of subplots
    fake = fig.add_subplot(grid[i])
    #fake.set_title(finbert_plotdata_names, fontweight='semibold', size=14)
    fake.set_axis_off()
    # create subgrid for two subplots without space between them
    # <https://matplotlib.org/2.0.2/users/gridspec.html>
    gs = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=grid[i], wspace=0.1)
    # Left panel: FinBERT sentiment vs constraining / uncertainty ratio.
    ax = fig.add_subplot(gs[0])
    ax.set_title(['Constraining Words:','Uncertainty Words:'][i], fontweight='bold', size=16, loc='left')
    # hide ticks and labels
    ax.tick_params(left=True, labelleft=True, labelbottom=[False,True][i], bottom=True)
    ax.axis([min(results_post['finbert sentiment']),max(results_post['finbert sentiment']),min(results_post[finbert_plotdata_names1[i]]),max(results_post[finbert_plotdata_names1[i]])])
    sns.regplot(x='finbert sentiment', y=finbert_plotdata_names1[i], data = results_post, line_kws={'color': 'black'})
    plt.xlabel('Finbert Sentiment Ratio', size=12)
    plt.ylabel(['LMD Constraining Ratio','LMD Uncertainty Ratio'][i], size=12)
    # Right panel: FinBERT sentiment vs pandemic words / LMD sentiment.
    ax = fig.add_subplot(gs[1])
    ax.set_title(['Pandemic Words:','LMD Sentiment:'][i], fontweight='bold', size=16, loc='left')
    # hide ticks and labels
    ax.tick_params(left=True, labelleft=True, labelbottom=[False,True][i], bottom=True)
    ax.axis([min(results_post['finbert sentiment']),max(results_post['finbert sentiment']),min(results_post[finbert_plotdata_names2[i]]),max(results_post[finbert_plotdata_names2[i]])])
    sns.regplot(x='finbert sentiment',y=finbert_plotdata_names2[i], data = results_post, line_kws={'color': 'black'})
    plt.xlabel('Finbert Sentiment Ratio', size=12)
    plt.ylabel(['Pandemic Words Ratio','LMD Sentiment'][i], size=12)
fig.patch.set_facecolor('lightgrey')
#fig.suptitle('Comparision between Wordcounts and FinBert Sentiment', fontweight='bold', size=16)
fig.tight_layout()
# Same layout as the previous figure, but with LMD sentiment on the x-axis
# (and FinBERT sentiment as one of the y-variables).
finbert_plotdata_names1 = ['constraining','uncertainty']
finbert_plotdata_names2 = ['pandemic words', 'finbert sentiment']
rows = 2
cols = 1
fig = plt.figure(figsize=(20, 10), dpi=80)
# grid for pairs of subplots
grid = plt.GridSpec(rows, cols)
for i in range(rows * cols):
    # create fake subplot just to title pair of subplots
    fake = fig.add_subplot(grid[i])
    #fake.set_title(finbert_plotdata_names, fontweight='semibold', size=14)
    fake.set_axis_off()
    # create subgrid for two subplots without space between them
    # <https://matplotlib.org/2.0.2/users/gridspec.html>
    gs = gridspec.GridSpecFromSubplotSpec(1, 2, subplot_spec=grid[i], wspace=0.1)
    # Left panel: LMD sentiment vs constraining / uncertainty ratio.
    ax = fig.add_subplot(gs[0])
    ax.set_title(['Constraining Words:','Uncertainty Words:'][i], fontweight='bold', size=16, loc='left')
    # hide ticks and labels
    ax.tick_params(left=True, labelleft=True, labelbottom=[False,True][i], bottom=True)
    ax.axis([min(results_post['sentiment']),max(results_post['sentiment']),min(results_post[finbert_plotdata_names1[i]]),max(results_post[finbert_plotdata_names1[i]])])
    sns.regplot(x='sentiment', y=finbert_plotdata_names1[i], data = results_post, line_kws={'color': 'black'})
    plt.xlabel('LMD Sentiment Ratio', size=12)
    plt.ylabel(['LMD Constraining Ratio','LMD Uncertainty Ratio'][i], size=12)
    # Right panel: LMD sentiment vs pandemic words / FinBERT sentiment.
    ax = fig.add_subplot(gs[1])
    ax.set_title(['Pandemic Words:','Finbert Sentiment:'][i], fontweight='bold', size=16, loc='left')
    # hide ticks and labels
    ax.tick_params(left=True, labelleft=True, labelbottom=[False,True][i], bottom=True)
    ax.axis([min(results_post['sentiment']),max(results_post['sentiment']),min(results_post[finbert_plotdata_names2[i]]),max(results_post[finbert_plotdata_names2[i]])])
    sns.regplot(x='sentiment',y=finbert_plotdata_names2[i], data = results_post, line_kws={'color': 'black'})
    plt.xlabel('LMD Sentiment Ratio', size=12)
    plt.ylabel(['Pandemic Words Ratio','Finbert Sentiment'][i], size=12)
fig.patch.set_facecolor('lightgrey')
#fig.suptitle('Comparision between Wordcounts and LMD Sentiment Sentiment', fontweight='bold', size=16)
fig.tight_layout()
Topic Model¶
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
Text_TM = csv_save_processed
Test whether every 'content' entry has the expected datatype (str)
#isinstance(item, float) for item in Text_TM['content']
#all(isinstance(item, str) for item in test)
# Sanity check: record the type of every 'content' cell, then list the row
# positions whose content is not a plain str (e.g. NaN from failed extraction).
test = [type(Text_TM['content'][i]) for i in range(len(Text_TM))]
test2 = [i for i, cell_type in enumerate(test) if cell_type != str]
test2
[]
Text_TM[Text_TM['content'] == '']
| Issuer/Borrower PermID | content | sentences | |
|---|---|---|---|
| 12 | 4296140188 | [] | |
| 217 | 5081364438 | [] |
Drop empty rows
# Drop the two prospectuses whose extracted text was empty (rows 12 and 217,
# see the table above), then renumber rows so label indexing is contiguous.
Text_TM = Text_TM.drop([12, 217])
# BUG FIX: reindex() without arguments is a no-op; reset_index renumbers.
Text_TM = Text_TM.reset_index(drop=True)
Text_TM.rename(columns={'filename': 'Issuer/Borrower PermID'}, inplace=True)
# Attach the issue date so the sample can be split pre-/post-pandemic.
Text_TM = Text_TM.merge(IPOs[['Issuer/Borrower PermID', 'Issue Date']], on='Issuer/Borrower PermID')
Text_TM['Issue Date'] = Text_TM['Issue Date'].apply(lambda x: x.date())
# BUG FIX: pd.datetime was deprecated and removed in pandas 2.0;
# pd.Timestamp is the supported spelling.
split_date = pd.Timestamp(2020, 1, 1).date()
# .copy() so the later in-place 'content' cleaning does not write into a view
# (avoids SettingWithCopyWarning / silently lost assignments).
TM_pre = Text_TM.loc[Text_TM['Issue Date'] <= split_date].copy()
TM_post = Text_TM.loc[Text_TM['Issue Date'] > split_date].copy()
Further pre-processing of text
# Shared text-hygiene helpers for the topic-model cleaning below.
my_stopwords = nltk.corpus.stopwords.words('english')
# Snowball-module Porter stemmer; stopwords are stemmed too (ignore_stopwords=False)
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
# punctuation and symbols to strip, including currency signs and en/em dashes
my_punctuation = '#!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@₹€$£—–'
# cleaning master function
def clean_Text_data_TopicM(Text_data_TopicM, bigrams=False):
    """Normalise one prospectus text for the topic model.

    Lower-cases, strips punctuation and digits, removes several lists of
    distorting tokens (money terms, country names, roman-numeral bullets,
    prospectus boiler-plate), drops stopwords and stems every remaining
    token. With bigrams=True the stemmed unigrams are extended by their
    adjacent-pair bigrams joined with '_'. Returns one space-joined string.
    """
    text = Text_data_TopicM.lower()
    # (pattern, replacement) pairs, applied in this exact order
    substitutions = [
        ('[' + my_punctuation + ']+', ' '),   # strip punctuation
        ('([0-9]+)', ''),                     # remove numbers
        # currencies and money related terms
        (r'\beuro\b|\beur\b|usd|rmb|rm|renminbi|cent|million|prc', ''),
        # countries or country institutions
        (r'hong|kong|hk|cayman|uk|indian|india|indic|\beu\b|china|germani|singapor|united|kingdom|\bus\b|china|chinese|european|mofcom', ''),
        # roman-numeral bullets
        (r'\bi\b | \bii\b | \biii\b| \biv\b| \bv\b| \bvi\b| \bvii\b| \bx\b| \bxi\b', ''),
        # frequently used words in prospectuses, which could distort topics
        (r'prospectus|see|section| ordinary |\bmr\b |\bmrs\b|class|stock| shares', ''),
        (r'\s+', ' '),                        # collapse double spacing
    ]
    for pattern, repl in substitutions:
        text = re.sub(pattern, repl, text)
    tokens = [w for w in text.split(' ') if w not in my_stopwords]
    # stem everything except hashtag-like tokens
    tokens = [w if '#' in w else word_rooter(w) for w in tokens]
    if bigrams:
        tokens = tokens + [a + '_' + b for a, b in zip(tokens, tokens[1:])]
    return ' '.join(tokens)
# First cleaning pass over both subsamples (overwrites 'content' in place).
# NOTE(review): TM_pre/TM_post are .loc slices of Text_TM above — if they were
# not copied, this assignment can raise SettingWithCopyWarning; confirm.
TM_pre['content'] = TM_pre.content.apply(clean_Text_data_TopicM)
TM_post['content'] = TM_post.content.apply(clean_Text_data_TopicM)
# second cleaning step
def clean_Text_data_TopicM_2(Text_data_TopicM, bigrams=False):
    """Remove stray single-letter / stemming-artefact tokens.

    The bigrams flag exists only for signature parity with the first
    cleaning step and is not used.
    """
    leftovers = r"\ber\b | \bu\b | \bha\b| \btrt\b | \bl\b | \be\b | \bb\b | \bc\b"
    return re.sub(leftovers, '', Text_data_TopicM)
# Second (artefact-removal) cleaning pass over both subsamples.
TM_pre['content'] = TM_pre.content.apply(clean_Text_data_TopicM_2)
TM_post['content'] = TM_post.content.apply(clean_Text_data_TopicM_2)
Calculate Topic Model
from sklearn.feature_extraction.text import CountVectorizer
# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern=r'\w+|\$[\d\.]+|\S+')
# BUG FIX: the original called fit_transform twice, so the vocabulary (and
# tf_feature_names below) matched only the post-2020 sample while tf_pre had
# a different, incompatible column order. Fit the vocabulary once on the full
# corpus and transform each subsample so both matrices share it.
vectorizer.fit(pd.concat([TM_pre['content'], TM_post['content']]))
tf_pre = vectorizer.transform(TM_pre['content']).toarray()
tf_post = vectorizer.transform(TM_post['content']).toarray()
# tf_feature_names tells us what word each column in the matrix represents
tf_feature_names = vectorizer.get_feature_names_out()
from sklearn.decomposition import LatentDirichletAllocation
# 5 topics chosen ad hoc here; the grid search further below tunes this.
number_of_topics = 5
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)
def display_topics(model, feature_names, no_top_words):
    """Tabulate the top terms of a fitted LDA model.

    For every topic, the no_top_words highest-weighted terms are listed
    beside their weights (formatted to one decimal place), producing a
    'words'/'weights' column pair per topic.
    """
    columns = {}
    for idx, weights in enumerate(model.components_):
        # indices of the strongest terms, highest weight first
        top = weights.argsort()[:-no_top_words - 1:-1]
        columns["Topic %d words" % (idx)] = ['{}'.format(feature_names[j]) for j in top]
        columns["Topic %d weights" % (idx)] = ['{:.1f}'.format(weights[j]) for j in top]
    return pd.DataFrame(columns)
# Fit LDA on the pre-2020 document-term matrix and show its top words.
# NOTE(review): with the double fit_transform above, tf_feature_names reflects
# only the post-2020 vocabulary — verify the pre-sample table's word labels.
model_pre = model.fit(tf_pre)
no_top_words = 25
topics_pre= display_topics(model_pre, tf_feature_names, no_top_words)
topics_pre
| Topic 0 words | Topic 0 weights | Topic 1 words | Topic 1 weights | Topic 2 words | Topic 2 weights | Topic 3 words | Topic 3 weights | Topic 4 words | Topic 4 weights | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | composit | 1156.2 | liabl | 1550.5 | composit | 1355.4 | rapid | 1057.2 | liabl | 1919.0 |
| 1 | attack | 882.8 | progress | 663.5 | placement | 646.3 | defer | 861.4 | length | 1079.7 |
| 2 | possess | 487.6 | qualiti | 536.0 | discov | 568.3 | defin | 751.9 | forecast | 776.5 |
| 3 | concern | 475.8 | januari | 448.8 | pipelin | 558.9 | billion | 719.4 | clean | 556.0 |
| 4 | go | 456.1 | length | 406.5 | dee | 524.8 | code | 620.3 | monetari | 403.6 |
| 5 | faith | 356.8 | oversea | 379.4 | possess | 468.5 | gdpr | 545.1 | go | 392.6 |
| 6 | destroy | 334.1 | cancel | 378.7 | opinion | 394.5 | monetari | 512.2 | attack | 318.2 |
| 7 | burden | 312.7 | defin | 321.6 | hack | 372.9 | intang | 510.7 | defer | 315.9 |
| 8 | lesser | 265.3 | magnitud | 314.5 | bankruptci | 351.7 | invalid | 509.9 | perag | 312.1 |
| 9 | interact | 249.3 | accru | 279.6 | clearanc | 340.6 | liabl | 501.5 | amongst | 301.6 |
| 10 | memorandum | 242.3 | misappropri | 278.6 | loyalti | 304.2 | familiar | 473.6 | privaci | 298.8 |
| 11 | institut | 232.7 | equal | 277.9 | deliveri | 303.6 | perag | 435.7 | emptiv | 276.2 |
| 12 | expert | 226.0 | depreci | 277.7 | passiv | 296.7 | concern | 413.8 | composit | 272.6 |
| 13 | confi | 223.1 | confi | 257.6 | destroy | 291.9 | citizen | 410.1 | magnitud | 265.5 |
| 14 | impair | 218.5 | pool | 245.7 | negoti | 281.0 | cancel | 407.1 | principl | 259.0 |
| 15 | geani | 213.8 | fraud | 223.5 | mostli | 271.4 | equal | 379.1 | depreci | 256.8 |
| 16 | instanc | 213.7 | movement | 212.1 | confi | 260.9 | indirectli | 374.1 | equival | 242.9 |
| 17 | leakag | 209.5 | re | 209.9 | join | 257.9 | late | 372.7 | ad | 238.2 |
| 18 | outbreak | 208.9 | interact | 196.9 | intensifi | 238.9 | possess | 361.3 | contrari | 214.7 |
| 19 | insuffici | 199.8 | devic | 194.5 | regist | 235.1 | broad | 354.6 | regist | 213.9 |
| 20 | asia | 195.4 | deliveri | 170.7 | familiar | 232.9 | necessarili | 348.7 | broad | 210.7 |
| 21 | headcount | 195.0 | choos | 161.7 | efficaci | 232.8 | administr | 331.0 | intensifi | 205.9 |
| 22 | divert | 193.3 | award | 156.0 | perag | 220.9 | ad | 318.6 | mechan | 205.8 |
| 23 | notwithstand | 190.2 | contrari | 139.1 | fraud | 218.5 | dissemin | 288.1 | fraud | 204.3 |
| 24 | hn | 189.0 | coverag | 137.9 | liabl | 212.5 | featur | 280.3 | fiscal | 197.1 |
# Refit on the post-2020 matrix. NOTE(review): sklearn's fit returns self, so
# model_pre and model_post alias the same (now post-fitted) object; topics_pre
# was materialised earlier, but do not reuse model_pre after this line.
model_post = model.fit(tf_post)
topics_post = display_topics(model_post, tf_feature_names, no_top_words)
topics_post
| Topic 0 words | Topic 0 weights | Topic 1 words | Topic 1 weights | Topic 2 words | Topic 2 weights | Topic 3 words | Topic 3 weights | Topic 4 words | Topic 4 weights | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | custom | 2270.5 | properti | 3941.0 | candid | 3622.5 | project | 928.1 | combin | 3391.6 |
| 1 | bank | 1530.3 | custom | 3048.6 | patent | 3032.9 | portfolio | 715.4 | warrant | 1888.1 |
| 2 | partner | 1013.8 | enterpris | 1241.8 | clinic | 2832.2 | infrastructur | 525.6 | target | 1695.9 |
| 3 | manufactur | 886.3 | data | 1112.1 | trial | 2120.3 | energi | 504.4 | sponsor | 1019.0 |
| 4 | credit | 638.8 | reput | 1100.6 | manufactur | 1380.7 | net | 498.5 | holder | 738.9 |
| 5 | supplier | 624.6 | brand | 1067.2 | licens | 1261.0 | issuer | 478.3 | entiti | 571.6 |
| 6 | facil | 618.8 | resid | 973.7 | intellectu | 1210.7 | theshar | 461.5 | redempt | 521.1 |
| 7 | suppli | 603.3 | consum | 952.7 | properti | 1171.9 | power | 439.3 | vote | 496.1 |
| 8 | pandem | 573.5 | intellectu | 839.8 | data | 971.7 | properti | 427.7 | per | 439.9 |
| 9 | flow | 569.0 | platfo | 835.5 | collabor | 886.0 | acquir | 416.1 | acquir | 418.0 |
| 10 | disrupt | 555.7 | supplier | 754.2 | patient | 866.8 | renew | 392.6 | unit | 404.4 |
| 11 | reput | 447.2 | record | 750.0 | enterpris | 816.0 | construct | 331.0 | consumm | 388.5 |
| 12 | action | 411.1 | subsidiari | 743.7 | enforc | 715.7 | note | 308.5 | privat | 369.5 |
| 13 | end | 407.1 | administr | 739.8 | medic | 713.3 | land | 273.4 | opportun | 361.9 |
| 14 | data | 401.5 | qualiti | 721.8 | research | 694.0 | trust | 257.5 | affili | 360.9 |
| 15 | loan | 397.3 | enforc | 697.7 | test | 603.8 | facil | 256.5 | founder | 348.3 |
| 16 | fiscal | 361.0 | entiti | 677.4 | administr | 562.7 | advis | 251.7 | trust | 330.7 |
| 17 | capac | 342.2 | fee | 647.4 | program | 537.3 | electr | 242.8 | member | 326.7 |
| 18 | litig | 334.8 | contractu | 607.8 | safe | 528.6 | flow | 241.5 | offic | 314.7 |
| 19 | raw | 325.0 | social | 607.5 | resid | 466.8 | realis | 241.4 | amend | 314.6 |
| 20 | plant | 314.1 | expand | 602.4 | effort | 439.5 | effici | 226.2 | redeem | 312.7 |
| 21 | economi | 311.6 | decemb | 591.1 | court | 434.5 | target | 223.5 | conflict | 263.7 |
| 22 | theshar | 304.9 | action | 567.4 | subsidiari | 425.5 | borrow | 221.8 | theshar | 256.6 |
| 23 | proceed | 300.2 | leas | 558.5 | litig | 424.3 | counterparti | 221.8 | propos | 251.8 |
| 24 | march | 297.7 | sourc | 556.7 | circular | 423.3 | transmiss | 217.2 | proce | 250.6 |
Model Evaluation
from sklearn.model_selection import GridSearchCV
# Define Search Param
search_params = {'n_components': [1 , 3, 4 , 5 , 6 , 8 , 10], 'learning_decay': [.5, .7, .9]}
# Init the Model
lda = LatentDirichletAllocation()
# Init Grid Search Class (rebinds `model`; the earlier LDA object is still
# reachable via model_pre/model_post)
model = GridSearchCV(lda, param_grid=search_params)
# Do the Grid Search on the pre-2020 matrix; LDA's default score is the
# approximate log likelihood, which GridSearchCV maximises.
model.fit(tf_pre)
GridSearchCV(estimator=LatentDirichletAllocation(),
param_grid={'learning_decay': [0.5, 0.7, 0.9],
'n_components': [1, 3, 4, 5, 6, 8, 10]})
# Best Model found by the grid search
best_lda_model = model.best_estimator_
# Model Parameters
print("Best Model's Params: ", model.best_params_)
# Log Likelihood Score (mean cross-validated score of the best cell)
print("Best Log Likelihood Score: ", model.best_score_)
# Perplexity, evaluated on the held-out post-2020 matrix
print("Model Perplexity: ", best_lda_model.perplexity(tf_post))
result = pd.DataFrame(model.cv_results_)
# Pull the mean CV log-likelihoods out of the grid-search results, one list
# per learning_decay setting, ordered by the number of topics.
n_topics = [1, 3, 4, 5, 6, 8, 10]

def _mean_scores(decay):
    # keep only the grid cells that were run with this learning_decay
    return [round(score)
            for params, score in zip(result['params'], result['mean_test_score'])
            if params['learning_decay'] == decay]

log_likelyhoods_5 = _mean_scores(0.5)
log_likelyhoods_7 = _mean_scores(0.7)
log_likelyhoods_9 = _mean_scores(0.9)
# Show graph: one log-likelihood curve per learning-decay setting
plt.figure(figsize=(16, 6), facecolor='lightgrey')
plt.plot(n_topics, log_likelyhoods_5, label='0.5', color= 'blue')
plt.plot(n_topics, log_likelyhoods_7, label='0.7', color= 'dodgerblue')
plt.plot(n_topics, log_likelyhoods_9, label='0.9', color= 'turquoise')
# optional marker for the chosen topic count, kept for reference:
#plt.vlines(x=4, ymin=-618000, ymax=-604100, colors='lightgrey', ls='--', lw=2)
plt.title("Choosing Optimal LDA Model", size=16)
plt.xlabel("Number of Topics", size = 12)
plt.ylabel("Log Likelyhood Scores", size = 12)
plt.legend(title='Learning decay', loc='best', fontsize=12, title_fontsize=12)
plt.show()
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
# NOTE(review): np.matrix is deprecated in NumPy — presumably used because
# pyLDAvis expects a 2-D matrix-like doc-term input here; confirm whether a
# plain ndarray or scipy sparse matrix works and switch.
test = np.matrix(tf_post)
# interactive topic-model visualisation of the best grid-search model
panel = pyLDAvis.sklearn.prepare(best_lda_model, test, vectorizer, mds='tsne')
panel
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
# Document-topic distributions of the post-2020 sample under the best model.
lda_output = best_lda_model.transform(tf_post)
# Construct the k-means clusters
# NOTE(review): 15 clusters on a topic simplex of at most 10 dimensions
# (the grid searched n_components up to 10) — confirm the count is intended.
clusters = KMeans(n_clusters=15, random_state=100).fit_predict(lda_output)
# Build the Singular Value Decomposition(SVD) model
svd_model = TruncatedSVD(n_components=2) # 2 components
lda_output_svd = svd_model.fit_transform(lda_output)
# X and Y axes of the plot using SVD decomposition
x = lda_output_svd[:, 0]
y = lda_output_svd[:, 1]
# Weights for the columns of lda_output, for each SVD component
print("Component's weights: \n", np.round(svd_model.components_, 2))
# Percentage of total information in 'lda_output' explained by the two components
print("Perc of Variance Explained: \n", np.round(svd_model.explained_variance_ratio_, 2))
Component's weights: [[ 0.32 0.07 0.94 0.05] [ 0.93 0.02 -0.33 0.17]] Perc of Variance Explained: [0.23 0.36]
# Plot the documents in the 2-D SVD projection, coloured by k-means cluster.
plt.figure(figsize=(5, 5))
plt.scatter(x, y, c=clusters)
# BUG FIX: the original called plt.xlabel twice, so the first label was
# overwritten and the y axis stayed unlabeled.
plt.ylabel('Component 2')
plt.xlabel('Component 1')
plt.title("Segregation of Topic Clusters")
Text(0.5, 1.0, 'Segregation of Topic Clusters')
Print word cloud¶
from wordcloud import WordCloud
# Build (word, weight) pairs for Topic 1 of the most recently fitted model.
test=display_topics(model, tf_feature_names, no_top_words)
# display_topics returns weights as strings; cast for the word cloud
test['Topic 1 weights'] = [float(x) for x in test['Topic 1 weights']]
tuples = [tuple(x) for x in test[['Topic 1 words', 'Topic 1 weights']].values]
def black_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Color callback for WordCloud.recolor: paint every word near-black."""
    return "hsl(0,100%, 1%)"
# Render Topic 1 as a monochrome word cloud from the (word, weight) pairs.
plt.figure(figsize=(10,8),facecolor = 'white', edgecolor='black')
wordcloud = WordCloud( background_color="white", width=3000, height=2000, max_words=500).generate_from_frequencies(dict(tuples))
wordcloud.recolor(color_func = black_color_func)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
Analyze the counted LMD words¶
# Assemble the LMD word-count results into DataFrames and split by issue date.
senti_res = pd.DataFrame(senti_res, columns=['words', 'positive', 'negative', 'positive_words', 'negative_words'])
uncert_const_res = pd.DataFrame(uncert_const_res, columns=['words', 'uncertain', 'constraining', 'uncertain_words', 'constraining_words'])
# NOTE(review): the dropped ID Series keeps its gapped index; assignment
# aligns on index, so rows labelled 12/217 here would receive NaN — verify
# whether a reset_index(drop=True) on the right-hand side is intended.
senti_res['Issuer/Borrower PermID'] = csv_save_processed['Issuer/Borrower PermID'].drop([12, 217])
uncert_const_res['Issuer/Borrower PermID'] = csv_save_processed['Issuer/Borrower PermID'].drop([12, 217])
# Attach issue dates so both tables can be split pre-/post-pandemic.
senti_res = senti_res.merge(IPOs[['Issuer/Borrower PermID', 'Issue Date']], on='Issuer/Borrower PermID')
uncert_const_res = uncert_const_res.merge(IPOs[['Issuer/Borrower PermID', 'Issue Date']], on='Issuer/Borrower PermID')
senti_res['Issue Date'] = senti_res['Issue Date'].apply(lambda x: x.date())
uncert_const_res['Issue Date'] = uncert_const_res['Issue Date'].apply(lambda x: x.date())
# BUG FIX: pd.datetime was removed in pandas 2.0; use pd.Timestamp instead.
split_date = pd.Timestamp(2020, 1, 1).date()
senti_res_pre = senti_res.loc[senti_res['Issue Date'] <= split_date]
senti_res_post = senti_res.loc[senti_res['Issue Date'] > split_date]
uncert_const_res_pre = uncert_const_res.loc[uncert_const_res['Issue Date'] <= split_date]
uncert_const_res_post = uncert_const_res.loc[uncert_const_res['Issue Date'] > split_date]
import itertools
# Flatten the per-document word lists into one long column per category.
senti_words_pre = pd.DataFrame(itertools.chain.from_iterable(senti_res_pre['positive_words']), columns=['positive_words'])
senti_words_pre['negative_words'] = pd.DataFrame(itertools.chain.from_iterable(senti_res_pre['negative_words']))
other_words_pre = pd.DataFrame(itertools.chain.from_iterable(uncert_const_res_pre['uncertain_words']), columns=['uncertain_words'])
other_words_pre['constraining_words'] = pd.DataFrame(itertools.chain.from_iterable(uncert_const_res_pre['constraining_words']))
senti_words_post = pd.DataFrame(itertools.chain.from_iterable(senti_res_post['positive_words']), columns=['positive_words'])
senti_words_post['negative_words'] = pd.DataFrame(itertools.chain.from_iterable(senti_res_post['negative_words']))
other_words_post = pd.DataFrame(itertools.chain.from_iterable(uncert_const_res_post['uncertain_words']), columns=['uncertain_words'])
other_words_post['constraining_words'] = pd.DataFrame(itertools.chain.from_iterable(uncert_const_res_post['constraining_words']))
# The shorter column leaves NaN padding; blank it out.
# BUG FIX: the pre-sample column was originally overwritten with POST data
# (other_words_post on the right-hand side).
other_words_pre['constraining_words'] = other_words_pre['constraining_words'].replace(np.nan, "")
other_words_post['constraining_words'] = other_words_post['constraining_words'].replace(np.nan, "")
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
# Stem every counted word so inflected variants aggregate in the frequency
# tables below (hashtag-like tokens are kept verbatim).
senti_words_pre = senti_words_pre.apply(lambda col: [word_rooter(w) if '#' not in w else w for w in col])
# BUG FIX: the post sample was originally computed from senti_words_pre,
# silently replacing the post-2020 words with pre-2020 ones.
senti_words_post = senti_words_post.apply(lambda col: [word_rooter(w) if '#' not in w else w for w in col])
other_words_pre = other_words_pre.apply(lambda col: [word_rooter(w) if '#' not in w else w for w in col])
other_words_post = other_words_post.apply(lambda col: [word_rooter(w) if '#' not in w else w for w in col])
# Frequency of every (stemmed) LMD hit, per category and sample period.
pos_words_freq_pre = senti_words_pre['positive_words'].value_counts()
neg_words_freq_pre = senti_words_pre['negative_words'].value_counts()
uncert_words_freq_pre = other_words_pre['uncertain_words'].value_counts()
constr_words_freq_pre = other_words_pre['constraining_words'].value_counts()
pos_words_freq_post = senti_words_post['positive_words'].value_counts()
neg_words_freq_post = senti_words_post['negative_words'].value_counts()
uncert_words_freq_post = other_words_post['uncertain_words'].value_counts()
constr_words_freq_post = other_words_post['constraining_words'].value_counts()
# Drop the dominant boiler-plate stems so the word clouds show more variety.
# NOTE(review): Series.drop raises KeyError when a label is absent — assumes
# every listed stem occurs in both samples.
uncert_words_freq_post = uncert_words_freq_post.drop(labels=['may','could','risk','might','uncertainti','depend','possibl','fluctuat'])
uncert_words_freq_pre = uncert_words_freq_pre.drop(labels=['may','could','risk','might','uncertainti','depend','possibl','fluctuat'])
neg_words_freq_pre = neg_words_freq_pre.drop(labels=['adver','failur','loss','delay','termin','unabl'])
neg_words_freq_post = neg_words_freq_post.drop(labels=['adver','failur','loss','delay','termin','unabl'])
# Word cloud of the most frequent negative LMD words, post-2020 sample.
plt.figure(figsize=(10,8),facecolor = 'white', edgecolor='black', dpi = 400)
wordcloud = WordCloud( background_color="white", width=3000, height=2000, max_words=500).generate_from_frequencies(dict(neg_words_freq_post))
wordcloud.recolor(color_func = black_color_func)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
# Same plot for the pre-2020 sample.
plt.figure(figsize=(10,8),facecolor = 'white', edgecolor='black', dpi = 400)
wordcloud = WordCloud( background_color="white", width=3000, height=2000, max_words=500).generate_from_frequencies(dict(neg_words_freq_pre))
wordcloud.recolor(color_func = black_color_func)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
Analyze Text Similarity (NOT IN PAPER)¶
from sklearn.feature_extraction.text import CountVectorizer
LemVectorizer = CountVectorizer(tokenizer=tokenizer, stop_words='english')
LemVectorizer.fit_transform(Text_TM['content'])
LemVectorizer.vocabulary_
{'input_ids': 1, 'token_type_ids': 2, 'attention_mask': 0}
type(Text_TM['content'])
pandas.core.series.Series
!jupyter nbconvert --to html NLP_Part_Risk_Factors.ipynb